Other: Air Quality Index
Configuration for the AQI spider (an excerpt of conf.ini):
# City name
CITYNAME = 北京
CITY = beijing
MONTH = 2013-12
# Start date (year and month)
DATE_BEGIN = 2018/01
# End date (year and month)
DATE_END = 2018/02
# Save path
SAVE_PATH = data/
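These keys are only an excerpt: configparser expects them grouped under named sections, and the spider below reads SPIDER_TYPE from [MAIN], SAVE_PATH from [WEATHER], and the city and date keys from a section named after the spider type. A minimal sketch of that lookup, assuming the type section is called [AQI] (the actual section names and grouping in the author's conf.ini may differ):

import configparser

# Hypothetical conf.ini layout, reconstructed from the conf.get() calls in the
# spider code below; the section name "AQI" and the exact grouping of keys are
# assumptions, not the author's actual file.
SAMPLE_CONF = """
[MAIN]
SPIDER_TYPE = AQI

[WEATHER]
SAVE_PATH = data/

[AQI]
CITYNAME = 北京
CITY = beijing
DATE_BEGIN = 2018/01
DATE_END = 2018/02
"""

conf = configparser.ConfigParser()
conf.read_string(SAMPLE_CONF)

type_ = conf.get("MAIN", "SPIDER_TYPE")      # "AQI"
print(conf.get(type_, "CITY"))               # beijing
print(conf.get(type_, "DATE_BEGIN"))         # 2018/01
print(conf.get("WEATHER", "SAVE_PATH"))      # data/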
import sys
import datetime
import configparser

import pandas as pd

# Project-local helper modules (not shown in this post).
import manager
import html_downloader
import html_parser
import html_output


class SpiderMain(object):
    def __init__(self, conf):
        self.conf = conf
        self.dates = manager.DateManager()
        self.downloader = html_downloader.HtmlDownloader(conf)
        self.parser = html_parser.HtmlParser(conf)
        self.output = html_output.HtmlOutput()

    def craw(self):
        count = 1
        type_ = self.conf.get("MAIN", "SPIDER_TYPE")
        # One entry per month between DATE_BEGIN and DATE_END; freq="M" yields
        # month-end dates, so the end month itself is appended explicitly.
        df_dates = pd.date_range(start=self.conf.get(type_, "DATE_BEGIN"),
                                 end=self.conf.get(type_, "DATE_END"),
                                 freq="M")
        df_dates = list(df_dates)
        df_dates.append(datetime.datetime.strptime(self.conf.get(type_, "DATE_END"), "%Y/%m"))
        self.dates.add_new_dates(df_dates)

        while self.dates.has_new_date():
            try:
                new_date = self.dates.get_new_date()
                print("craw %d: %s %s" % (count, self.conf.get(type_, "CITY"),
                                          new_date.strftime("%Y-%m")))
                html_cont = self.downloader.download(new_date)
                new_data = self.parser.parse(new_date, html_cont)
                if new_data is None:
                    print("No data for this date")
                    count += 1
                    continue
                self.output.collect_data(new_data)
                count += 1
            except Exception as e:
                print(e)
                self.downloader.terminateBroswer()
                print("craw failed")
                sys.exit()

        self.downloader.terminateBroswer()
        # Despite its name, save_excel() is given a .csv path here (see the results below).
        self.output.save_excel(type_,
                               self.conf.get("WEATHER", "SAVE_PATH") + type_ + "_" +
                               self.conf.get(type_, "CITY") + ".csv")


if __name__ == '__main__':
    conf = configparser.ConfigParser()
    conf.read("conf.ini", encoding="utf-8-sig")
    obj_spider = SpiderMain(conf)
    obj_spider.craw()
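The helper modules imported above (manager, html_downloader, html_parser, html_output) are part of the project and are not reproduced in this post. Purely for orientation, here is a hypothetical sketch of what manager.DateManager might look like, inferred only from the three methods SpiderMain calls; the author's actual class may well differ:

# Hypothetical sketch of manager.DateManager, inferred from how SpiderMain
# uses it: a simple FIFO queue of dates that still need to be crawled.
from collections import deque


class DateManager(object):
    def __init__(self):
        self._new_dates = deque()   # dates not yet crawled

    def add_new_dates(self, dates):
        # Queue a batch of datetime-like objects for crawling.
        self._new_dates.extend(dates)

    def has_new_date(self):
        # True while at least one uncrawled date remains.
        return len(self._new_dates) > 0

    def get_new_date(self):
        # Hand out the next date in insertion order.
        return self._new_dates.popleft()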
3 Crawl results
The first unnamed column is the DataFrame index; the GRADE values are the site's Chinese air-quality categories (优 = excellent, 良 = good, 轻度污染 / 中度污染 / 重度污染 = light / moderate / heavy pollution).
,Date,AQI,GRADE,PM25,PM10,SO2,CO,NO2,O3
0,2018-01-01,57,良,34,63,9,1,44,38
1,2018-01-02,50,优,28,50,7,0.8,33,46
2,2018-01-03,28,优,11,28,5,0.4,21,51
3,2018-01-04,40,优,15,30,4,0.5,32,39
4,2018-01-05,63,良,32,54,8,0.9,50,36
5,2018-01-06,48,优,16,30,5,0.5,38,48
6,2018-01-07,54,良,38,57,10,0.8,43,50
7,2018-01-08,55,良,12,60,4,0.3,11,61
8,2018-01-09,35,优,7,35,3,0.3,13,63
9,2018-01-10,32,优,6,21,3,0.3,9,63
10,2018-01-11,32,优,13,25,5,0.5,25,55
11,2018-01-12,87,良,61,81,11,1.3,69,21
12,2018-01-13,139,轻度污染,106,122,15,1.7,79,17
13,2018-01-14,176,中度污染,133,137,12,1.7,71,41
14,2018-01-15,59,良,32,58,8,0.9,47,42
15,2018-01-16,114,轻度污染,45,177,5,0.7,42,55
16,2018-01-17,77,良,42,104,8,1,49,62
17,2018-01-18,93,良,63,105,11,1.2,74,19
18,2018-01-19,108,轻度污染,81,122,13,1.5,73,36
19,2018-01-20,63,良,31,75,9,0.8,48,32
20,2018-01-21,64,良,37,77,19,1.2,44,29
21,2018-01-22,47,优,22,47,4,0.6,25,65
22,2018-01-23,32,优,11,32,4,0.4,15,60
23,2018-01-24,35,优,17,33,5,0.4,28,65
24,2018-01-25,33,优,9,23,4,0.4,26,62
25,2018-01-26,53,良,29,41,8,0.6,42,42
26,2018-01-27,112,轻度污染,84,95,14,1.4,67,14
27,2018-01-28,59,良,11,67,3,0.4,15,66
28,2018-01-29,56,良,8,61,3,0.4,20,67
29,2018-01-30,36,优,11,36,6,0.4,24,68
30,2018-01-31,62,良,22,74,7,0.5,29,65
31,2018-02-01,68,良,30,85,10,0.7,36,64
32,2018-02-02,34,优,8,30,5,0.3,11,68
33,2018-02-03,32,优,8,21,5,0.4,20,64
34,2018-02-04,40,优,19,36,8,0.5,32,55
35,2018-02-05,34,优,7,18,3,0.3,15,67
36,2018-02-06,65,良,47,61,13,0.9,51,43
37,2018-02-07,51,良,33,52,8,0.6,31,76
38,2018-02-08,73,良,53,76,12,0.9,53,49
39,2018-02-09,80,良,40,109,8,0.7,32,74
40,2018-02-10,44,优,10,44,3,0.3,12,74
41,2018-02-11,67,良,9,84,2,0.3,6,73
42,2018-02-12,53,良,12,56,4,0.4,15,73
43,2018-02-13,79,良,58,79,13,1.1,57,50
44,2018-02-14,37,优,13,33,3,0.4,18,74
45,2018-02-15,73,良,53,66,10,0.5,22,80
46,2018-02-16,107,轻度污染,80,101,14,0.6,24,85
47,2018-02-17,109,轻度污染,82,92,10,1.3,35,49
48,2018-02-18,152,中度污染,116,109,14,1.7,46,80
49,2018-02-19,188,中度污染,141,131,24,2,42,69
50,2018-02-20,63,良,45,42,6,0.7,21,85
51,2018-02-21,48,优,33,43,6,0.6,26,81
52,2018-02-22,45,优,18,45,6,0.5,29,77
53,2018-02-23,52,良,23,54,6,0.6,29,79
54,2018-02-24,62,良,30,74,10,0.6,21,65
55,2018-02-25,82,良,60,71,10,0.8,43,66
56,2018-02-26,162,中度污染,123,121,21,1.5,69,86
57,2018-02-27,218,重度污染,168,170,12,1.8,51,66
58,2018-02-28,115,轻度污染,87,116,9,1,34,74
4 Visualization
The project mainly uses the Selenium, BeautifulSoup, pandas, and scrapy Python packages, among others.
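The original figures are not reproduced here, but the saved CSV is straightforward to plot. A minimal sketch, assuming the output file is data/AQI_beijing.csv (which follows from the save_excel() call above if SPIDER_TYPE is AQI) and using matplotlib, which the post itself does not mention:

# Minimal visualization sketch: plot daily AQI and PM2.5 from the crawled CSV.
# The file name data/AQI_beijing.csv assumes SPIDER_TYPE = AQI in conf.ini, and
# matplotlib is this sketch's choice; the post does not say what it plotted with.
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("data/AQI_beijing.csv", index_col=0, parse_dates=["Date"])

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(df["Date"], df["AQI"], label="AQI")
ax.plot(df["Date"], df["PM25"], label="PM2.5")
ax.set_xlabel("Date")
ax.set_ylabel("Index / concentration")
ax.set_title("Beijing air quality, 2018-01 to 2018-02")
ax.legend()
fig.autofmt_xdate()
plt.tight_layout()
plt.show()

Swapping the plotted columns for PM10, SO2, NO2, O3, or CO gives the corresponding pollutant curves.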
Reply "空气质量" (air quality) to this public account to get the source code; if you have questions, leave a message in the backend.